/**********************************************************************/
/*
   Author: Robbie Dulin
   Created: 3 May 2021
   Updated: Apr 2024
   Description: Cleans the raw August 2020 SAKERNAS data.

   This cleaning file should output 2 different datasets:
   1. Cleaned SAKERNAS data:
   sak_aug20_deid_clean
   2. Subset of SAKERNAS person-batch data matched with PMO for SAKERNAS analysis:
   sak_aug20_deid_clean_merged
*/
/**********************************************************************/

*******************************************
* Setup

* Load the project file paths unless a master do-file has already set them
  if "$master_run" !="1" include "./Do/SET_FILEPATHS.do"

* Close any log left open, then start a new date-stamped text log
  capture log close
  global prefix: display %tdCYND td(`c(current_date)')
  log using "$KP_logs/${prefix}_clean_SAKERNAS_aug2020.txt", text replace
  clear
  set more off

* Read the raw de-identified file and give the weight variable a full name
  use "$KP_deid_sakernas/Raw/SAKERNAS_PRAKERJA_20AUG_deid.dta", clear
  rename final_weig weight

/*----------------------------------------------------*/
              /* Section: Demographics */
/*----------------------------------------------------*/

* Household identifier: one id per distinct (id_nks, no_dsrt) combination.
* The "_s" suffix is inherited from earlier code; the reason is not recorded.
  egen hh_id_s = group(id_nks no_dsrt)
  summarize hh_id_s
  display `r(N)' / `r(max)' // observations divided by the largest household id

* Year of birth (code 9999 becomes missing after the tabulation)
  generate year_dob_sak = k5_th
  tabulate year_dob_sak
  replace year_dob_sak = . if year_dob_sak == 9999

* Month of birth (code 99 becomes missing after the tabulation)
  generate month_dob_sak = k5_bl
  tabulate month_dob_sak
  replace month_dob_sak = . if month_dob_sak == 99

* Age, plus a banded version in 20-year groups
  summarize k6
  generate age_sak = k6
  recode age_sak (0/20 = 1) (21/40 = 2) (41/60 = 3) (61/80 = 4) (81/100 = 5), generate(age_cat_sak)

* Indicator for age 30 or below (a missing age evaluates to 0 here)
  generate young = age_sak <= 30
  label define young 0 "Over 30 Years Old" 1 "30 and Under"
  label values young young

* Household size, and household size net of jlh_art5
  generate hh_size_sak = jlh_art
  generate hh_size_sak_5 = jlh_art - jlh_art5

* Relationship to the household head (k3), with value labels
  tabulate k3, missing
  generate relation_hh_head = k3
  label define relations 1 "HH head" 2 "Spouse" 3 "Son/Daughter" 4 "Step/adopted child" 5 "Son/Daughter-in-law" 6 "Grandchild" 7 "Parent/Parent-in-law" 8 "Other family" 9 "Housemaid" 10 "Driver/Gardener" 11 "No relation", replace
  label values relation_hh_head relations
  tabulate relation_hh_head

* Roster order: urutan must uniquely identify observations; flag the first
* record of each household when sorted by urutan
  isid urutan
  bysort hh_id_s (urutan): generate roster_first = _n == 1
  tabulate relation_hh_head roster_first

* Marital status indicators from r4
  tabulate r4, missing
  generate married  = (r4 == 2)
  generate divorced = (r4 == 3)
  generate widowed  = (r4 == 4)
  generate single   = (r4 == 1)

* Gender: female indicator
  tabulate k4, missing
  generate female = (k4 == 2)

* Gender, second coding: 1 = male, 0 = female
  generate gender = (k4 == 1)
  label define gender 1 "male"  0 "female"
  label values gender gender

* Currently a student
  tabulate r5, missing
  generate current_student = (r5 == 2)

* education levels (indicators built from r6a)
  tab r6a, m
  gen no_elementary = r6a == 1
  gen elementary = r6a == 2
  gen junior_high = r6a == 3
  gen high_school = r6a == 4 | r6a == 5
  gen tertiary = 6 <= r6a & r6a <= 8
  summ no_elementary elementary junior_high high_school tertiary

* education -> years of schooling
  gen educ = r6a
  gen school_years = 3 if educ == 1 // No elementary
  replace school_years = 6 if inlist(educ, 2) // elementary
  replace school_years = 9 if inlist(educ, 3) // junior high
  replace school_years = 12 if inlist(educ, 4, 5) // high (8-11)
  replace school_years = 14 if educ == 6 // Diploma I/II/III
  replace school_years = 16 if educ == 7 // Diploma IV
  replace school_years = 18 if educ == 8 // S1/S2/S3

* Dummy for educated (graduating high school, i.e. 12 or more years of schooling)
  // Use >= 12 so high-school graduates count as educated, matching the value
  // label below. The !mi() guard keeps respondents with unknown schooling
  // missing; without it Stata treats missing as larger than 12 and codes them 1.
  // NOTE: this changes the stored values of `educated`, so the datasignature
  // strings checked before each save must be regenerated after re-running.
  gen educated = school_years >= 12 if !mi(school_years)
  la def educated 0 "Less than 12 Years Schooling" 1 "12 or More Years of Schooling"
  la val educated educated

* Training: ever completed a training course with a certificate (r6d)
  tabulate r6d, missing
  generate train_certif = (r6d == 1)

* Indicator for r6e == 1 (presumably "currently in training" - confirm against questionnaire)
  tabulate r6e, missing
  generate current_training = (r6e == 1)

* Same kabupaten as "birthplace": r7_kab is the kabupaten where the person
* lived 5 years ago and is used here in place of a true place of birth
  generate born_kab = r7_kab
  generate born_live_same = (born_kab == kode_kab)

* Province where the person lived 5 years ago
  generate born_prov = r7_prov

* Migration indicator taken directly from r7
  tabulate r7, missing
  generate migrated = (r7 == 2)

* Migration derived from codes: current kabupaten differs from the one 5 years ago
* NOTE(review): if r7_kab is ever missing, the comparison is true and the
* person is counted as a migrant - confirm this is intended
  generate kab5 = r7_kab
  generate migration = (kode_kab != kab5)

* Disabilities: items r8a-r8f map to six domains. For each domain keep the raw
* answer (<domain>_disability) and an indicator for answers 2 or 3 (<domain>_disabled).
  local domains vision hearing walk hand speech other
  local items   a b c d e f
  forvalues i = 1/6 {
    local dom  : word `i' of `domains'
    local item : word `i' of `items'

    tabulate r8`item', missing
    generate `dom'_disabled = inlist(r8`item', 2, 3)
    generate `dom'_disability = r8`item'
  }
  summarize *disabled

* Any domain flagged as disabled
  generate any_disability = vision_disabled | hearing_disabled | walk_disabled | hand_disabled | speech_disabled | other_disabled
  tabulate any_disability

* Any domain answered with code 3
  generate severe_disability = inlist(3, r8a, r8b, r8c, r8d, r8e, r8f)
  tabulate severe_disability

* Urban indicator from the survey classification
  tabulate klasifikas, missing
  generate urban_sak = (klasifikas == 1)

* Alternative urban/rural flag (still to be checked): kabupaten code divided
* by 10 falls in [7, 8), for the current and the 5-years-ago kabupaten
  generate city_sak = (kode_kab/10 >= 7) & (kode_kab/10 < 8)
  generate city_sak_5 = (born_kab/10 >= 7) & (born_kab/10 < 8)

* Java: province codes 31 through 36
  tabulate kode_prov
  generate java_sak = inrange(kode_prov, 31, 36)

* Java, based on the province of residence 5 years ago
  tabulate born_prov
  generate java_sak_5 = inrange(born_prov, 31, 36)
/*----------------------------------------------------*/
                /* Section: Employment */
/*----------------------------------------------------*/

* Employed, first pass: a "yes" (code 1) on any of
*   r9a  worked at least 1 hour in the past week
*   r9b  did any income-generating activity
*   r9c  helped with work
*   r10a temporarily not working
  foreach item in r9a r9b r9c r10a {
    tabulate `item', missing
  }
  generate employed = inlist(1, r9a, r9b, r9c, r10a)
  tabulate employed, missing

* Employment status (r12a) with value labels
  tabulate r12a, missing
  generate employment_status = r12a
  label define status 0 "Not Working" 1 "Self-employed" 2 "Business owner with temporary/unpaid workers" 3 "Business owner with paid workers" 4 "Employee" 5 "Temporary worker in agriculture" 6 "Temporary worker (non-agriculture)" 7 "Family/unpaid worker", replace
  label values employment_status status
  tabulate employment_status, missing

* Type-of-work indicators derived from employment status (12A)
  generate self_employed      = (employment_status == 1)
  generate business_owner     = inlist(employment_status, 2, 3)
  generate perm_worker        = (employment_status == 4)
  generate temp_worker        = inlist(employment_status, 5, 6)
  generate family_unpaid      = (employment_status == 7)
  generate self_emp_bus_owner = inlist(employment_status,1,2,3)

* Temporarily not working: 1 when r10a == 1, otherwise 0
  generate temp_not_work = (r10a == 1)

* Employed, final definition: overwrites the first pass using employment
* status 1-6 only, so family/unpaid workers (7) are coded as not employed
  replace employed = inlist(employment_status, 1, 2, 3, 4, 5, 6)
  tabulate employed, missing
  tabulate employed employment_status, missing row

* Business field (only defined for the employed)
  // 17-sector classification: (https://www.bps.go.id/statictable/2016/01/06/1898/klasifikasi-17-sektor-tabel-input-output-indonesia-2010.html)
  tab r13a_kateg, m
  // Label text fix: "Infomation" -> "Information" (value labels only; stored data unchanged)
  la def business_fields 1 "Agriculture, Forestry and Fisheries" 2 "Mining and excavation" 3 "Industrial Production" 4 "Electricity and Gas" ///
    5 "Water Supply, Waste Management, and Recycling" 6 "Construction" 7 "Wholesale and Retail Trade, Car and Motorcycle Repair" ///
    8 "Transportation and Warehousing" 9 "Hospitality and Restaurants" 10 "Information and Communication" 11 "Financial Services and Insurance" ///
    12 "Real Estate" 13 "Business Services" 14 "Government Administration, Defense and Social Security" 15 "Education" 16 "Health and Social Services" 17 "Other", replace
  gen business_field = r13a_kateg if employed == 1
  la val business_field business_fields
  tab business_field, m

* KBJI code (https://www.bps.go.id/website/fileMenu/KBJI-2014.pdf)
  // occupation group taken from r13b_kbji2, only defined for the employed
  tab r13b_kji19, m
  tab r13b_kbji2, m
  la def occupations 0 "Army and Police" 1 "Manager" 2 "Professional" 3 "Technicians and Assistant Professionals" 4 "Administrative Personnel" 5 "Business Services and Sales Personnel" 6 "Skilled Workers in Agriculture, Forestry and Fisheries" 7 "Production, Craft, and Related Workers" 8 "Machine Operators and Assembly Workers" 9 "Blue-collar workers", replace
  gen occupation = r13b_kbji2 if employed == 1
  la val occupation occupations
  tab occupation, m


* Wage last month (assumed to be July). Earnings are only built for
* self-employed, employees and temporary workers (status 1, 4, 5, 6).
  local wage_earner inlist(employment_status, 1, 4, 5, 6)

  summarize r14a1 r14a2 if `wage_earner'
  tabulate r14a1 if r14a1 < 0
  // NOTE: 3 negative values. Some respondents are self-employed, so costs
  // may exceed revenues. Kept for now.

* Earnings excluding business owners: sum of the two r14a components
  generate earnings = r14a1 + r14a2 if `wage_earner'
  summarize earnings if `wage_earner', detail
  count if missing(earnings)

* Individual earnings, set to zero for people who are not employed
  generate ind_earnings = earnings
  replace ind_earnings = 0 if employed == 0
  summarize ind_earnings, detail

* Winsorize at the 1st/99th percentiles of strictly positive earnings
* (zeros are left alone), then rescale to 1000s of rupiah
  generate ind_adj_earnings = ind_earnings
  summarize ind_adj_earnings if ind_adj_earnings > 0, detail
  replace ind_adj_earnings = r(p99) if !missing(ind_adj_earnings) & ind_adj_earnings > r(p99)
  replace ind_adj_earnings = r(p1) if !missing(ind_adj_earnings) & ind_adj_earnings != 0 & ind_adj_earnings < r(p1)
  replace ind_adj_earnings = ind_adj_earnings/1000
  label variable ind_adj_earnings "Adjusted Earnings (in 1000s of rupiah, winsorized of 1st and 99th pct)"
  summarize ind_adj_earnings, detail
  generate ind_pos_adj_earnings = ind_adj_earnings if ind_adj_earnings > 0

* Indicator for strictly positive earnings (missing when earnings are missing)
  generate ind_earnings_pos = ind_earnings > 0 if !missing(ind_earnings)

* Hours worked last week: inspect the r16a* items, then keep the raw r16a value
  summarize r16a? if inlist(employment_status, 1, 4, 5, 6)
  summarize r16a
  count if r16a > (7*14)
  histogram r16a
  generate hours_worked_raw = r16a
  summarize hours_worked_raw if hours_worked_raw != 0

* Hours worked, capped at 14 hours per day (98 per week) and set to 0 for the
* non-employed
* NOTE FROM MICHELLE: Check if the other SAKERNAS truncate hours worked for consistency
  summarize r16a if inlist(employment_status, 1, 4, 5, 6)
  generate hours_worked = cond(r16a > 14*7 & r16a != ., 14*7, r16a)
  replace hours_worked = 0 if employed == 0
  summarize hours_worked_raw hours_worked

* hourly wage (income per day / hours per day)
* raw wages and hours (in rupiah, no winsorizing or capping)
  gen hourly_wage_raw = (earnings / 31) / (hours_worked_raw / 7)
  summ hourly_wage_raw, d

* Hourly wage from winsorized earnings and capped hours.
* ind_adj_earnings is already in 1000s of rupiah, so hourly_wage and every
* variable derived from it below is in 1000s of rupiah as well.
  gen hourly_wage = (ind_adj_earnings/31)/(hours_worked / 7) if hours_worked != 0
  replace hourly_wage = 0 if hours_worked == 0
  gen poswage = hourly_wage if hourly_wage > 0

* Winsorize 1% and 99% percentiles (computed over positive wages; zeros untouched)
  gen hourly_adj_wage = hourly_wage
  sum hourly_adj_wage if hourly_adj_wage > 0, d
  replace hourly_adj_wage = r(p99) if hourly_adj_wage > r(p99) & !mi(hourly_adj_wage)
  replace hourly_adj_wage = r(p1) if hourly_adj_wage < r(p1) & !mi(hourly_adj_wage) & hourly_adj_wage != 0
  la var hourly_adj_wage "Adjusted Hourly Wage (in 1000s of rupiah, winsorized of 1st and 99th pct)"
  sum hourly_adj_wage, d

* Positive wages
  gen poswage_adj = hourly_adj_wage if hourly_adj_wage > 0
  gen poswage_dummy = hourly_adj_wage > 0 if !mi(hourly_adj_wage)

* Replace wages as missing if no hours are worked
  gen hourly_wage_no0 = hourly_wage
  replace hourly_wage_no0 = . if hours_worked == 0

* Positive wages
  gen poswage_no0 = hourly_wage_no0 if hourly_wage_no0 > 0

* Winsorize 1% and 99% percentiles
  // The previous extra division by 1000 is removed: hourly_wage_no0 is already
  // in 1000s of rupiah, so dividing again put this variable in millions,
  // contradicting its label and the scale of hourly_adj_wage.
  // NOTE: this changes the stored values of hourly_adj_wage_no0 and
  // poswage_adj_no0, so the datasignature strings checked before each save
  // must be regenerated after re-running.
  gen hourly_adj_wage_no0 = hourly_wage_no0
  sum hourly_adj_wage_no0 if hourly_adj_wage_no0 > 0, d
  replace hourly_adj_wage_no0 = r(p99) if hourly_adj_wage_no0 > r(p99) & !mi(hourly_adj_wage_no0)
  replace hourly_adj_wage_no0 = r(p1) if hourly_adj_wage_no0 < r(p1) & !mi(hourly_adj_wage_no0)
  la var hourly_adj_wage_no0 "Adjusted Hourly Wage (no zeroes, in 1000s of rupiah, winsorized of 1st and 99th pct)"
  sum hourly_adj_wage_no0, d
  gen poswage_adj_no0 = hourly_adj_wage_no0 if hourly_adj_wage_no0 > 0

* Household earnings: sum of individual earnings within the household, left
* missing when any member has missing individual earnings
  bys hh_id_s: egen mi_ind_earnings = max(mi(ind_earnings))
  bys hh_id_s: egen hh_earnings = total(ind_earnings) if mi_ind_earnings ==0
  gen hh_pos_earnings = hh_earnings if hh_earnings > 0

  // "adjusted" here only means rescaled to 1000s of rupiah (no winsorizing)
  gen hh_adj_earnings = hh_earnings
  replace hh_adj_earnings = hh_adj_earnings / 1000
  // label corrected: this variable holds household earnings, not an hourly wage
  la var hh_adj_earnings "Adjusted Household Earnings (in 1000s of rupiah)"
  sum hh_adj_earnings, d
  gen hh_pos_adj_earnings = hh_adj_earnings if hh_adj_earnings > 0

  gen hh_earnings_pos = hh_earnings > 0 if !mi(hh_earnings)

  // Exchange rate: 14,403.60 rp/$
  // median monthly salary (assuming 8 hours/day, 5 days/week, 4 weeks/month)
  // Re-summarize the raw hourly wage (in rupiah) first: otherwise r(p50) still
  // holds the median of hh_adj_earnings from the summarize above.
  summ hourly_wage_raw, d
  di `r(p50)' * 4 * 5 * 8
  di `r(p50)' * 4 * 5 * 8 / 14400

* Imputed wage: linear prediction from occupation, business field, province,
* age (as categories) and gender
  summarize hourly_wage
  foreach covariate in occupation business_field kode_prov {
    tabulate `covariate'
  }
  summarize age_sak
  tabulate female
  regress hourly_wage i.occupation i.business_field i.kode_prov i.age_sak female, vce(robust)
  * NOTE(review): the prediction is restricted to observations that already
  * have a wage, so nothing is filled in for missing wages - confirm intent
  predict wage_impute if hourly_wage != .
  summarize wage_impute hourly_wage
  correlate wage_impute hourly_wage

* Wage change relative to before covid: indicator for r14b == 2
  tabulate r14b, missing
  generate covid_wage_loss = (r14b == 2)

* work tenure
* Builds a monthly start date for the main job from r15a1 (month) and r15a2
* (year), converts it to tenure in months as of August 2020, and blanks out
* tenures that are not shorter than (age in months - 48).
  tab r15a1, m
  tab r15a2, m // open question: are the 9999 codes "don't know" answers?
  tab r15a2 if r15a1 == 99
  tab r15a1 if r15a2 == 9999

  * start year (codes 0 and 9999 are treated as missing)
  gen year_tenure = r15a2
  replace year_tenure = . if year_tenure == 0 | year_tenure == 9999
  tab year_tenure if employed == 1, m

  * start month (code 0 is treated as missing; code 99 is handled below)
  gen month_tenure = r15a1
  replace month_tenure = . if month_tenure == 0
  tab month_tenure if employed == 1, m

  // month code 99: use June (middle of the year) when the year is known,
  // otherwise set the month to missing
  replace month_tenure = 6 if month_tenure == 99 & year_tenure != .
  replace month_tenure = . if month_tenure == 99 & year_tenure == .
  tab month_tenure if employed == 1, m

  * create start date (monthly date; missing if year or month is missing)
  gen start_date = ym(year_tenure, month_tenure)
  format start_date %tm
  summ start_date, format
  count if start_date == . & employed == 1

  * create tenure in months, measured back from August 2020
  summ start_date, format
  count if start_date < tm(2020m8)
  gen tenure_months = tm(2020m8) - start_date
  summ tenure_months, d
  // longest tenure expressed in years (uses r(max) from the summarize above)
  di `r(max)'/12
  count if tenure_months == . & employed == 1
  hist tenure_months

  * check
  gen age_months = 12 * age_sak
  // subtract 48 months to get number of months one has been at least 4 years old (approximately)
  replace age_months = age_months - 48
  // check that respondents do not report being at the same job since before they were 4 years old
  summ age_months
  // cap noi: report a failed assertion in the log but keep running
  cap noi assert tenure_months < age_months if tenure_months != .
  // keep the unedited tenure as *_raw, then blank out implausible values in the clean copy
  rename tenure_months tenure_months_raw
  gen tenure_months = tenure_months_raw
  replace tenure_months = . if tenure_months >= age_months
  // helper variables are no longer needed
  drop year_tenure month_tenure age_months
  summ tenure_months*
  tab tenure_months if employed == 1, m

* Internet use at work: r17a == 0 is left missing at first, then set to 0 for
* people who are not employed
  tabulate r17a, missing
  generate use_internet_work = (r17a == 1) if r17a != 0
  replace use_internet_work = 0 if mi(use_internet_work) & employed == 0
  tabulate use_internet_work

* Held more than one job
  tabulate r20a, missing
  generate multiple_jobs = (r20a == 1)
  tabulate multiple_jobs, missing

* Job search and business creation
  tabulate r22a, missing
  generate looking_job = (r22a == 1)

  tabulate r22b, missing
  generate prep_new_bus = (r22b == 1)

* Any "yes" among the six search-effort items r24a-r24f
  generate activities_job = inlist(1, r24a, r24b, r24c, r24d, r24e, r24f)
  label variable activities_job "Any effort to find a job"

* One indicator per search-effort item (1 = yes, 2 = no, 0 = not asked)
  generate reg_job_mkt     = (r24a == 1)
  generate contact_company = (r24b == 1)
  generate advert_online   = (r24c == 1)
  generate search_network  = (r24d == 1)
  generate cap_loc_lic     = (r24e == 1)
  generate any_attempt     = (r24f == 1)

  tabulate r25a, missing

* Labor force: employed, looking for a job, or preparing a new business
  tabulate employed, missing
  generate labor_force = (employed == 1) | (looking_job == 1) | (prep_new_bus == 1)
  tabulate labor_force, missing

* Own business (employment status 2 or 3), person level and household level
  generate with_emp = inlist(employment_status, 2,3)

*bys hh_id: egen with_emp_HH = max(with_emp)
  bysort hh_id_s: egen with_emp_HH_S = max(with_emp)

* Household level: anyone preparing a new business (22B), anyone self-employed
  bysort hh_id_s: egen prep_bus_hh = max(prep_new_bus)
  bysort hh_id_s: egen hh_emp_self = max(self_employed)

/*----------------------------------------------------*/
            /* Section: Kartu Prakerja */
/*----------------------------------------------------*/

  * Knows about Kartu Prakerja
  tabulate r27a, missing
  generate prakerja_familiar = (r27a == 1)

  * Reports having signed up for Kartu Prakerja
  tabulate r27b, missing
  generate report_applied = (r27b == 1)
  tabulate report_applied

  * Main reason for signing up (code 0 left missing)
  tabulate r27c, missing
  generate reason_prakerja = r27c if r27c != 0
  label define reasons 1 "Skills" 2 "Incentives" 3 "Fill spare time" 4 "Join friends/try repeatedly"  5 "Free registration" 6 "Other", replace
  label values reason_prakerja reasons
  tabulate reason_prakerja

  * Reports having been selected
  tabulate r27d, missing
  generate report_selected = (r27d == 1)
  tabulate report_applied report_selected

  * Completed the Kartu Prakerja training
  tabulate r27e, missing
  generate complete_pk_train = (r27e == 1)
  tabulate complete_pk_train, missing

  * Reports that Kartu Prakerja improved skills
  tabulate r27f, missing
  generate pk_improve_skill = (r27f == 1)
  tabulate pk_improve_skill, missing

  * Received Kartu Prakerja incentives
  tabulate r27g, missing
  generate receive_pk_aid = (r27g == 1)

  * What the benefits were spent on: items r27h1-r27h5, code 0 left missing
  local purposes needs business debt save other
  forvalues j = 1/5 {
    local purpose : word `j' of `purposes'

    tabulate r27h`j', missing
    generate use_pk_aid_`purpose' = (r27h`j' == 1) if r27h`j' != 0
  }
  summarize use_pk_aid*

* New job since Prakerja registration opened. Batch 1 starts 11 April 2020, so
* any main-job start date (from 15a) in April 2020 or later counts.
* Business owners and the non-employed are set to 0.
  generate new_job = start_date >= tm(2020m4) if !missing(start_date)
  replace new_job = 0 if business_owner == 1 | employed == 0
  tabulate new_job, missing

* New business since the start of batch 1: same date rule, but everyone who is
* not a business owner, or not employed, is set to 0
  generate new_start = start_date >= tm(2020m4) if !missing(start_date)
  replace new_start = 0 if business_owner != 1 | employed == 0
  tabulate new_start, missing

* Covid assistance program (r27i) and one indicator per program code
  tabulate r27i, missing
  generate covid_program = r27i
  generate prog_subs          = (covid_program == 1)
  generate prog_food_assist   = (covid_program == 2)
  generate prog_direct_cash   = (covid_program == 3)
  generate prog_uncond_assist = (covid_program == 4)
  generate prog_other         = (covid_program == 5)

 ******************************************************
* Count how many other observations share each anon_id4
  duplicates tag anon_id4, generate(anon_id4_dupe)
  tabulate anon_id4_dupe

* Drop every observation whose non-missing anon_id4 appears more than once
* (all copies are removed; observations with a missing id are kept)
  drop if !missing(anon_id4) & anon_id4_dupe != 0
  drop anon_id4_dupe
  
/*----------------------------------------------------*/
               * Export cleaned data
/*----------------------------------------------------*/

* save, but only when the data signature matches the expected value
  datasignature
  if "`r(datasignature)'" == "792602:270(107964):1730762563:2564019301" {
    save "$KP_deid_sakernas/Clean/sak_aug20_deid_clean.dta", replace
  }
  else {
    di as err "Careful, your machine produces a different dataset"
    // `stop` is not a Stata command and only halted the run through an
    // "unrecognized command" error; close the log and exit with an explicit
    // nonzero return code instead.
    capture log close
    exit 9
  }
  

/*----------------------------------------------------*/
            * Merge with admin data and export
/*----------------------------------------------------*/

  * Keep only respondents that carry an identifier (userid_bps or anon_id4)
  keep if !mi(userid_bps) | !mi(anon_id4)

  * Drop raw survey variables that are no longer needed
  drop r? r?? r??? r???? k? klasifikas jlh_art* k5_* r7_* r13* r19* r29* date_incentive

  gen sak_round = 5

* merge
  * Stash the SAKERNAS records that have an anon_id4 in a temporary file
  preserve
  keep if !missing(anon_id4)
  tempfile sak_anon_id
  save `sak_anon_id'
  restore

  use "$KP_deid_admin/Clean/pmo_b1-22_clean_long_deid.dta", clear

  * Many admin rows per person; keep matched records only
  merge m:1 anon_id4 using `sak_anon_id', nogen keep(3)

  compress
  datasignature
  if "`r(datasignature)'" == "190268:216(56063):2765470512:1546259147" {
    save "$KP_deid_sakernas/Clean/sak_aug20_deid_clean_merged.dta", replace
  }
  else {
    di as err "Careful, your machine produces a different dataset"
    // `stop` is not a Stata command and only halted the run through an
    // "unrecognized command" error; close the log and exit with an explicit
    // nonzero return code instead.
    capture log close
    exit 9
  }

  * Close the log opened in the setup section
  capture log close
	

  // done
